{
  "cells": [
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "%matplotlib inline"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "\n# Classification of text documents using sparse features\n\n\nThis is an example showing how scikit-learn can be used to classify documents\nby topics using a bag-of-words approach. This example uses a scipy.sparse\nmatrix to store the features and demonstrates various classifiers that can\nefficiently handle sparse matrices.\n\nThe dataset used in this example is the 20 newsgroups dataset. It will be\nautomatically downloaded, then cached.\n\n\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "# Author: Peter Prettenhofer <peter.prettenhofer@gmail.com>\n",
        "#         Olivier Grisel <olivier.grisel@ensta.org>\n",
        "#         Mathieu Blondel <mathieu@mblondel.org>\n",
        "#         Lars Buitinck\n",
        "# License: BSD 3 clause\n",
        "import logging\n",
        "import numpy as np\n",
        "from optparse import OptionParser\n",
        "import sys\n",
        "from time import time\n",
        "import matplotlib.pyplot as plt\n",
        "\n",
        "from sklearn.datasets import fetch_20newsgroups\n",
        "from sklearn.feature_extraction.text import TfidfVectorizer\n",
        "from sklearn.feature_extraction.text import HashingVectorizer\n",
        "from sklearn.feature_selection import SelectFromModel\n",
        "from sklearn.feature_selection import SelectKBest, chi2\n",
        "from sklearn.linear_model import RidgeClassifier\n",
        "from sklearn.pipeline import Pipeline\n",
        "from sklearn.svm import LinearSVC\n",
        "from sklearn.linear_model import SGDClassifier\n",
        "from sklearn.linear_model import Perceptron\n",
        "from sklearn.linear_model import PassiveAggressiveClassifier\n",
        "from sklearn.naive_bayes import BernoulliNB, ComplementNB, MultinomialNB\n",
        "from sklearn.neighbors import KNeighborsClassifier\n",
        "from sklearn.neighbors import NearestCentroid\n",
        "from sklearn.ensemble import RandomForestClassifier\n",
        "from sklearn.utils.extmath import density\n",
        "from sklearn import metrics\n",
        "\n",
        "\n",
        "# Display progress logs on stdout\n",
        "logging.basicConfig(level=logging.INFO,\n",
        "                    format='%(asctime)s %(levelname)s %(message)s')\n",
        "\n",
        "# Options controlling the run. In a notebook or IPython console an empty\n",
        "# argument list is parsed (see `argv` below), so every option keeps its\n",
        "# default value.\n",
        "op = OptionParser()\n",
        "op.add_option(\"--report\",\n",
        "              action=\"store_true\", dest=\"print_report\",\n",
        "              help=\"Print a detailed classification report.\")\n",
        "op.add_option(\"--chi2_select\",\n",
        "              action=\"store\", type=\"int\", dest=\"select_chi2\",\n",
        "              help=\"Select some number of features using a chi-squared test\")\n",
        "op.add_option(\"--confusion_matrix\",\n",
        "              action=\"store_true\", dest=\"print_cm\",\n",
        "              help=\"Print the confusion matrix.\")\n",
        "op.add_option(\"--top10\",\n",
        "              action=\"store_true\", dest=\"print_top10\",\n",
        "              help=\"Print ten most discriminative terms per class\"\n",
        "                   \" for every classifier.\")\n",
        "op.add_option(\"--all_categories\",\n",
        "              action=\"store_true\", dest=\"all_categories\",\n",
        "              help=\"Whether to use all categories or not.\")\n",
        "op.add_option(\"--use_hashing\",\n",
        "              action=\"store_true\",\n",
        "              help=\"Use a hashing vectorizer.\")\n",
        "op.add_option(\"--n_features\",\n",
        "              action=\"store\", type=\"int\", default=2 ** 16,\n",
        "              help=\"n_features when using the hashing vectorizer.\")\n",
        "op.add_option(\"--filtered\",\n",
        "              action=\"store_true\",\n",
        "              help=\"Remove newsgroup information that is easily overfit: \"\n",
        "                   \"headers, signatures, and quoting.\")\n",
        "\n",
        "\n",
        "def is_interactive():\n",
        "    \"\"\"Return True when the ``__main__`` module has no ``__file__`` attribute.\n",
        "\n",
        "    Used below to decide whether ``sys.argv`` should be ignored.\n",
        "    \"\"\"\n",
        "    return not hasattr(sys.modules['__main__'], '__file__')\n",
        "\n",
        "\n",
        "# work-around for Jupyter notebook and IPython console\n",
        "argv = [] if is_interactive() else sys.argv[1:]\n",
        "(opts, args) = op.parse_args(argv)\n",
        "if len(args) > 0:\n",
        "    # op.error() prints the usage message and terminates the process itself,\n",
        "    # so no explicit sys.exit() is needed after it.\n",
        "    op.error(\"this script takes no arguments.\")\n",
        "\n",
        "print(__doc__)\n",
        "op.print_help()\n",
        "print()"
      ]
    },
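    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "Load data from the training and test sets\n------------------------------------\nLet's load data from the 20 newsgroups dataset, which comprises around 18,000\nnewsgroup posts on 20 topics split into two subsets: one for training (or\ndevelopment) and the other one for testing (or for performance evaluation).\nBy default only four of the 20 categories are loaded; the `--all_categories`\noption selects all of them. The posts are then converted to sparse feature\nvectors with a TF-IDF vectorizer (or a hashing vectorizer when\n`--use_hashing` is given).\n\n"
      ]
    },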
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "if opts.all_categories:\n",
        "    categories = None\n",
        "else:\n",
        "    categories = [\n",
        "        'alt.atheism',\n",
        "        'talk.religion.misc',\n",
        "        'comp.graphics',\n",
        "        'sci.space',\n",
        "    ]\n",
        "\n",
        "if opts.filtered:\n",
        "    remove = ('headers', 'footers', 'quotes')\n",
        "else:\n",
        "    remove = ()\n",
        "\n",
        "print(\"Loading 20 newsgroups dataset for categories:\")\n",
        "print(categories if categories else \"all\")\n",
        "\n",
        "data_train = fetch_20newsgroups(subset='train', categories=categories,\n",
        "                                shuffle=True, random_state=42,\n",
        "                                remove=remove)\n",
        "\n",
        "data_test = fetch_20newsgroups(subset='test', categories=categories,\n",
        "                               shuffle=True, random_state=42,\n",
        "                               remove=remove)\n",
        "print('data loaded')\n",
        "\n",
        "# order of labels in `target_names` can be different from `categories`\n",
        "target_names = data_train.target_names\n",
        "\n",
        "\n",
        "def size_mb(docs):\n",
        "    \"\"\"Return the total size of the strings in ``docs``, UTF-8 encoded, in MB.\"\"\"\n",
        "    return sum(len(s.encode('utf-8')) for s in docs) / 1e6\n",
        "\n",
        "\n",
        "data_train_size_mb = size_mb(data_train.data)\n",
        "data_test_size_mb = size_mb(data_test.data)\n",
        "\n",
        "print(\"%d documents - %0.3fMB (training set)\" % (\n",
        "    len(data_train.data), data_train_size_mb))\n",
        "print(\"%d documents - %0.3fMB (test set)\" % (\n",
        "    len(data_test.data), data_test_size_mb))\n",
        "print(\"%d categories\" % len(target_names))\n",
        "print()\n",
        "\n",
        "# the subsets were fetched separately above; take the targets of each one\n",
        "y_train, y_test = data_train.target, data_test.target\n",
        "\n",
        "print(\"Extracting features from the training data using a sparse vectorizer\")\n",
        "t0 = time()\n",
        "if opts.use_hashing:\n",
        "    # the hashing vectorizer is used without fitting: only transform() is called\n",
        "    vectorizer = HashingVectorizer(stop_words='english', alternate_sign=False,\n",
        "                                   n_features=opts.n_features)\n",
        "    X_train = vectorizer.transform(data_train.data)\n",
        "else:\n",
        "    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,\n",
        "                                 stop_words='english')\n",
        "    X_train = vectorizer.fit_transform(data_train.data)\n",
        "duration = time() - t0\n",
        "print(\"done in %fs at %0.3fMB/s\" % (duration, data_train_size_mb / duration))\n",
        "print(\"n_samples: %d, n_features: %d\" % X_train.shape)\n",
        "print()\n",
        "\n",
        "print(\"Extracting features from the test data using the same vectorizer\")\n",
        "t0 = time()\n",
        "X_test = vectorizer.transform(data_test.data)\n",
        "duration = time() - t0\n",
        "print(\"done in %fs at %0.3fMB/s\" % (duration, data_test_size_mb / duration))\n",
        "print(\"n_samples: %d, n_features: %d\" % X_test.shape)\n",
        "print()\n",
        "\n",
        "# mapping from integer feature name to original token string\n",
        "if opts.use_hashing:\n",
        "    # no feature names are kept when the hashing vectorizer is used\n",
        "    feature_names = None\n",
        "elif hasattr(vectorizer, 'get_feature_names_out'):\n",
        "    # scikit-learn >= 1.0 (`get_feature_names` was removed in 1.2)\n",
        "    feature_names = vectorizer.get_feature_names_out()\n",
        "else:\n",
        "    # older scikit-learn releases only provide `get_feature_names`\n",
        "    feature_names = vectorizer.get_feature_names()\n",
        "\n",
        "# Compare against None explicitly: the names may already be an array, whose\n",
        "# truth value is ambiguous.\n",
        "if feature_names is not None:\n",
        "    feature_names = np.asarray(feature_names)\n",
        "\n",
        "if opts.select_chi2:\n",
        "    print(\"Extracting %d best features by a chi-squared test\" %\n",
        "          opts.select_chi2)\n",
        "    t0 = time()\n",
        "    ch2 = SelectKBest(chi2, k=opts.select_chi2)\n",
        "    X_train = ch2.fit_transform(X_train, y_train)\n",
        "    X_test = ch2.transform(X_test)\n",
        "    if feature_names is not None:\n",
        "        # keep selected feature names\n",
        "        feature_names = feature_names[ch2.get_support(indices=True)]\n",
        "    print(\"done in %fs\" % (time() - t0))\n",
        "    print()\n",
        "\n",
        "\n",
        "def trim(s):\n",
        "    \"\"\"Trim string to fit on terminal (assuming 80-column display)\"\"\"\n",
        "    return s if len(s) <= 80 else s[:77] + \"...\""
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "Benchmark classifiers\n------------------------------------\nWe train each of 15 classification models on the training set and evaluate it\non the test set, recording its accuracy, training time and prediction time.\n\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "def benchmark(clf):\n",
        "    \"\"\"Fit ``clf``, score it on the test data and print a report.\n",
        "\n",
        "    The estimator is trained on the notebook-level ``X_train``/``y_train``\n",
        "    and evaluated on ``X_test``/``y_test``. The keyword listing, the\n",
        "    classification report and the confusion matrix are only printed when\n",
        "    the matching flag is set on ``opts``.\n",
        "\n",
        "    Returns a ``(name, accuracy, train_time, test_time)`` tuple, where\n",
        "    ``name`` is the part of ``str(clf)`` before the first ``(`` and both\n",
        "    times are elapsed seconds measured with ``time()``.\n",
        "    \"\"\"\n",
        "    print('_' * 80)\n",
        "    print(\"Training: \")\n",
        "    print(clf)\n",
        "\n",
        "    fit_started = time()\n",
        "    clf.fit(X_train, y_train)\n",
        "    train_time = time() - fit_started\n",
        "    print(\"train time: %0.3fs\" % train_time)\n",
        "\n",
        "    predict_started = time()\n",
        "    pred = clf.predict(X_test)\n",
        "    test_time = time() - predict_started\n",
        "    print(\"test time:  %0.3fs\" % test_time)\n",
        "\n",
        "    score = metrics.accuracy_score(y_test, pred)\n",
        "    print(\"accuracy:   %0.3f\" % score)\n",
        "\n",
        "    # Coefficient statistics are only available for models exposing `coef_`.\n",
        "    if hasattr(clf, 'coef_'):\n",
        "        print(\"dimensionality: %d\" % clf.coef_.shape[1])\n",
        "        print(\"density: %f\" % density(clf.coef_))\n",
        "\n",
        "        show_keywords = opts.print_top10 and feature_names is not None\n",
        "        if show_keywords:\n",
        "            print(\"top 10 keywords per class:\")\n",
        "            for class_idx, label in enumerate(target_names):\n",
        "                # indices of the ten largest coefficients for this class\n",
        "                top10 = np.argsort(clf.coef_[class_idx])[-10:]\n",
        "                keywords = \" \".join(feature_names[top10])\n",
        "                print(trim(\"%s: %s\" % (label, keywords)))\n",
        "        print()\n",
        "\n",
        "    if opts.print_report:\n",
        "        print(\"classification report:\")\n",
        "        report = metrics.classification_report(y_test, pred,\n",
        "                                               target_names=target_names)\n",
        "        print(report)\n",
        "\n",
        "    if opts.print_cm:\n",
        "        print(\"confusion matrix:\")\n",
        "        print(metrics.confusion_matrix(y_test, pred))\n",
        "\n",
        "    print()\n",
        "    # short label: everything before the first opening parenthesis\n",
        "    clf_descr = str(clf).partition('(')[0]\n",
        "    return clf_descr, score, train_time, test_time\n",
        "\n",
        "\n",
        "results = []\n",
        "for name, clf in (\n",
        "        (\"Ridge Classifier\", RidgeClassifier(tol=1e-2, solver=\"sag\")),\n",
        "        (\"Perceptron\", Perceptron(max_iter=50)),\n",
        "        (\"Passive-Aggressive\", PassiveAggressiveClassifier(max_iter=50)),\n",
        "        (\"kNN\", KNeighborsClassifier(n_neighbors=10)),\n",
        "        (\"Random forest\", RandomForestClassifier())):\n",
        "    print('=' * 80)\n",
        "    print(name)\n",
        "    results.append(benchmark(clf))\n",
        "\n",
        "for penalty in (\"l2\", \"l1\"):\n",
        "    print('=' * 80)\n",
        "    print(\"%s penalty\" % penalty.upper())\n",
        "    # Liblinear model first ...\n",
        "    results.append(\n",
        "        benchmark(LinearSVC(penalty=penalty, dual=False, tol=1e-3)))\n",
        "    # ... then the SGD model with the same penalty\n",
        "    results.append(\n",
        "        benchmark(SGDClassifier(alpha=.0001, max_iter=50, penalty=penalty)))\n",
        "\n",
        "# SGD once more, this time with the Elastic Net penalty\n",
        "print('=' * 80)\n",
        "print(\"Elastic-Net penalty\")\n",
        "results.append(\n",
        "    benchmark(SGDClassifier(alpha=.0001, max_iter=50, penalty=\"elasticnet\")))\n",
        "\n",
        "# NearestCentroid without threshold\n",
        "print('=' * 80)\n",
        "print(\"NearestCentroid (aka Rocchio classifier)\")\n",
        "results.append(benchmark(NearestCentroid()))\n",
        "\n",
        "# Sparse Naive Bayes classifiers\n",
        "print('=' * 80)\n",
        "print(\"Naive Bayes\")\n",
        "for nb_clf in (MultinomialNB(alpha=.01),\n",
        "               BernoulliNB(alpha=.01),\n",
        "               ComplementNB(alpha=.1)):\n",
        "    results.append(benchmark(nb_clf))\n",
        "\n",
        "print('=' * 80)\n",
        "print(\"LinearSVC with L1-based feature selection\")\n",
        "# The smaller C, the stronger the regularization.\n",
        "# The more regularization, the more sparsity.\n",
        "results.append(benchmark(Pipeline([\n",
        "    ('feature_selection',\n",
        "     SelectFromModel(LinearSVC(penalty=\"l1\", dual=False, tol=1e-3))),\n",
        "    ('classification', LinearSVC(penalty=\"l2\")),\n",
        "])))"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "Add plots\n------------------------------------\nThe bar plot shows the accuracy, training time and test time of each\nclassifier. Both times are normalized by dividing by the largest value across\nclassifiers, so that they can be drawn on the same axis as the accuracy.\n\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "# Transpose the list of (name, score, train_time, test_time) tuples into four\n",
        "# parallel sequences. `results` itself is deliberately left untouched so that\n",
        "# this cell can be re-run without corrupting the benchmark data.\n",
        "clf_names, score, training_time, test_time = zip(*results)\n",
        "indices = np.arange(len(clf_names))\n",
        "\n",
        "# Express both timings relative to the slowest classifier.\n",
        "training_time = np.array(training_time) / np.max(training_time)\n",
        "test_time = np.array(test_time) / np.max(test_time)\n",
        "\n",
        "plt.figure(figsize=(12, 8))\n",
        "plt.title(\"Score\")\n",
        "# three horizontal bars per classifier, offset vertically from one another\n",
        "plt.barh(indices, score, .2, label=\"score\", color='navy')\n",
        "plt.barh(indices + .3, training_time, .2, label=\"training time\",\n",
        "         color='c')\n",
        "plt.barh(indices + .6, test_time, .2, label=\"test time\", color='darkorange')\n",
        "plt.yticks(())\n",
        "plt.legend(loc='best')\n",
        "plt.subplots_adjust(left=.25, top=.95, bottom=.05)\n",
        "\n",
        "# y ticks are hidden above; write each classifier name next to its bars\n",
        "for i, c in zip(indices, clf_names):\n",
        "    plt.text(-.3, i, c)\n",
        "\n",
        "plt.show()"
      ]
    }
  ],
  "metadata": {
    "kernelspec": {
      "display_name": "Python 3",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.6.9"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}